# This dataset has varriables from 1,000 unconventional wells including well average porosity, log transform 
# of permeability (to linearize the relationships with other variables), accoustic impedance (kg/m2s*10^6), brittness ratio (%),
# total organic carbon (%), vitrinite reflectance (%), and production (MCFPD)

# Load  required libraries
library(plyr); library(scales)
library(ggplot2); library(lattice); library(corrplot)                              
## corrplot 0.84 loaded
library(tree); library(gbm); library(glmnet); library(randomForest); library(boot)
## Registered S3 method overwritten by 'tree':
##   method     from
##   print.tree cli
## Loaded gbm 2.1.5
## Loading required package: Matrix
## Loaded glmnet 3.0-2
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## 
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
## 
##     melanoma
# Read the data from .CSV
DF0 <- read.table("mv_unconv.csv", as.is = TRUE)    #import csv (str character)
rock.data <- read.csv(text = DF0[[1]])              #fix data type to numeric
rock.data <- rock.data[,-1]                         #remove first column, WellIndex

# Attach the data
attach(rock.data)

# Visualize the data
head(rock.data); tail(rock.data)
##     Por LogPerm   AI Brittle  TOC   VR Production
## 1 15.91    1.67 3.06   14.05 1.36 1.85    177.382
## 2 15.34    1.65 2.60   31.88 1.37 1.79   1479.768
## 3 20.45    2.02 3.13   63.67 1.79 2.53   4421.222
## 4 11.95    1.14 3.90   58.81 0.40 2.03   1488.318
## 5 19.53    1.83 2.57   43.75 1.40 2.11   5261.095
## 6 19.47    2.04 2.73   54.37 1.42 2.12   5497.006
##        Por LogPerm   AI Brittle   TOC   VR Production
## 995  11.95    1.14 2.97   67.18  0.80 2.06   931.6109
## 996  17.99    2.29 3.38   44.32  0.98 2.08  4211.5278
## 997  12.12    0.82 3.52   57.07 -0.04 1.73  1560.3337
## 998  15.55    1.50 2.48   58.25  1.89 2.35  2858.1805
## 999  20.89    2.02 3.23   46.17  1.71 2.27  6934.5763
## 1000 15.74    1.31 2.37   73.08  1.24 2.06   963.6758
# Summary statistics for each column
summary(rock.data)                                
##       Por           LogPerm            AI           Brittle      
##  Min.   : 5.40   Min.   :0.120   Min.   :0.960   Min.   :-10.50  
##  1st Qu.:12.86   1st Qu.:1.130   1st Qu.:2.578   1st Qu.: 39.72  
##  Median :14.98   Median :1.390   Median :3.010   Median : 49.68  
##  Mean   :14.95   Mean   :1.399   Mean   :2.983   Mean   : 49.72  
##  3rd Qu.:17.08   3rd Qu.:1.680   3rd Qu.:3.360   3rd Qu.: 59.17  
##  Max.   :24.65   Max.   :2.580   Max.   :4.700   Max.   : 93.47  
##       TOC               VR          Production       
##  Min.   :-0.260   Min.   :0.900   Min.   :    2.714  
##  1st Qu.: 0.640   1st Qu.:1.810   1st Qu.: 1191.370  
##  Median : 0.995   Median :2.000   Median : 1976.488  
##  Mean   : 1.004   Mean   :1.991   Mean   : 2247.296  
##  3rd Qu.: 1.360   3rd Qu.:2.172   3rd Qu.: 3023.594  
##  Max.   : 2.710   Max.   :2.900   Max.   :12568.644
# Correlation matrixand plot
cor_matrix <- round(cor(rock.data),3); print(cor_matrix)
##               Por LogPerm     AI Brittle    TOC    VR Production
## Por         1.000   0.808 -0.506  -0.255  0.714 0.085      0.687
## LogPerm     0.808   1.000 -0.325  -0.152  0.507 0.046      0.569
## AI         -0.506  -0.325  1.000   0.170 -0.549 0.487     -0.327
## Brittle    -0.255  -0.152  0.170   1.000 -0.246 0.296     -0.071
## TOC         0.714   0.507 -0.549  -0.246  1.000 0.306      0.504
## VR          0.085   0.046  0.487   0.296  0.306 1.000      0.142
## Production  0.687   0.569 -0.327  -0.071  0.504 0.142      1.000
corrplot(cor_matrix, method = "circle")      

# Scatterplot from Lattice
splom(rock.data,col=rgb(0,0,0,50,maxColorValue=440), pch=19,main = "Unconventional Dataset") 

# Comparison of Production against Porosity and Brittleness
par(mfrow=c(1,2))
plot(Por,Production, main="Production vs. Porosity", col=alpha("black",0.5),
     xlab="Porosity (%)", ylab="Production (MCF/d", pch=19)
plot(Brittle,Production, main="Production vs. Brittleness", col=alpha("black",0.5),
     xlab="Brittleness (%)", ylab="Production (MCF/d",pch=19)

par(mfrow=c(1,1))
#these two variables show highest and lowest correlation on the correlation matrix, respectively

# Comparison of Porosity and Brittleness
prod.deciles <- quantile(Production, 0:10/10); prod.deciles
##           0%          10%          20%          30%          40%          50% 
##     2.713535   635.029558  1040.736163  1333.724569  1653.544353  1976.487820 
##          60%          70%          80%          90%         100% 
##  2370.192510  2807.014175  3355.907657  4144.058669 12568.644130
cut.prod <- cut(Production, prod.deciles, include.lowest=TRUE)
par(mfrow=c(1,2))
plot(Por, Brittle, col=grey(10:2/11)[cut.prod], pch=20, 
     xlab="Porosity (%)", ylab="Brittleness (%)", main="Production (MCF/d)")

# Separate data into Training and Testing Sets
train <- rock.data[sample(nrow(rock.data),500,replace=FALSE),]
train_id <- sample(seq_len(nrow(rock.data)), size=500, replace=FALSE)
train <- rock.data[train_id,]
test <- rock.data[-train_id,]

# Check properties of training set to ensure correctness
head(train)
##       Por LogPerm   AI Brittle  TOC   VR Production
## 669  9.98    0.93 4.11   66.51 0.17 2.50   703.2806
## 707 16.99    1.96 2.86   21.86 0.88 1.63   633.2486
## 492  9.57    1.02 3.44   59.36 0.53 2.14  1041.3394
## 275 17.49    1.20 2.20   22.34 1.06 1.37   717.0724
## 821 16.32    1.40 2.41   46.78 1.21 1.72  3457.7714
## 190 17.40    1.65 1.88   27.97 0.46 0.90  1301.3714
dim(train)
## [1] 500   7
# Plot training set and compare to original/complete data set
cut.train.prod <- cut(train$Production, prod.deciles, include.lowest=TRUE)
plot(train$Por,train$Brittle, col=grey(10:2/11)[cut.train.prod], pch=20,
     xlab="Porosity (%)", ylab="Brittleness (%)", main="Training - Production (MCF/d)")    

par(mfrow=c(1,1))

#####

# BOOTSTRAP 
porosity <- rock.data$Por
summary(porosity)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.40   12.86   14.98   14.95   17.08   24.65
hist(porosity,main="Porosity (%)", freq=FALSE, ylim=c(0,.2))

plot(ecdf(porosity), main="Porosity(%)", xlab="Porosity (%)", ylab="Cumulative Probability")

calc_average <- function(d, i=c(1:n)){
  d2 <- d[i]
  return(mean(d2))
}
boot.por.avg <- boot(data=porosity, statistic=calc_average, R=50000)
hist(boot.por.avg$t, main="Bootstrap Porosity Average", freq=FALSE)

plot(ecdf(boot.por.avg$t), main="Bootstrap Porosity Average")

# DECISION TREES
# Design Decision Tree control parameters
tree.control <- tree.control(nobs=500, mincut=5, minsize=10, mindev=0.01)

# Decision Tree on training data
tree.prod <- tree(Production~., data=train, control=tree.control)
summary(tree.prod)
## 
## Regression tree:
## tree(formula = Production ~ ., data = train, control = tree.control)
## Variables actually used in tree construction:
## [1] "Por"     "Brittle"
## Number of terminal nodes:  13 
## Residual mean deviance:  222900 = 108500000 / 487 
## Distribution of residuals:
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -2349.000  -263.500     3.618     0.000   315.900  2016.000
plot(tree.prod); text(tree.prod, pretty=0)

# Another way to visualize
plot(rock.data$Por,rock.data$Brittle, col=grey(10:2/11)[cut.prod], pch=20, 
     xlab="Porosity (%)", ylab="Brittleness (%)")
partition.tree(tree.prod, ordvars=c("Por","Brittle"), add=TRUE)

# Pruned Tree
cv.prod <- cv.tree(tree.prod, K=10)
plot(cv.prod$size, cv.prod$dev, type="b")      #looks like 6 nodes is good pruning

prune.prod <- prune.tree(tree.prod, best=6)

# Comparison of regular vs pruned trees
par(mfrow=c(2,2))
plot(tree.prod); text(tree.prod, pretty=0)
plot(prune.prod); text(prune.prod, pretty=0)
plot(rock.data$Por,rock.data$Brittle, col=grey(10:2/11)[cut.prod], pch=20, xlab="Porosity (%)", ylab="Brittleness (%)")
    partition.tree(tree.prod, ordvars=c("Por","Brittle"), add=TRUE)
plot(rock.data$Por,rock.data$Brittle, col=grey(10:2/11)[cut.prod], pch=20, xlab="Porosity (%)", ylab="Brittleness (%)")
    partition.tree(prune.prod, ordvars=c("Por","Brittle"), add=TRUE)

par(mfrow=c(1,1))
    
# Prediciton from pruned tree
yhat.prod <- predict(prune.prod, newdata=test)
plot(yhat.prod, test$Production)   #6 bins from the 6 terminal nodes
abline(0,1) 

tree.error <- mean((yhat.prod-test$Production)^2); tree.error
## [1] 693338
RMSE.tree <- sqrt(tree.error); RMSE.tree
## [1] 832.6692
var.prod <- var(test$Production); var.prod
## [1] 2316516
# LINEAR REGRESSION
lm.fit <- lm(Production ~., data=train); lm.fit
## 
## Call:
## lm(formula = Production ~ ., data = train)
## 
## Coefficients:
## (Intercept)          Por      LogPerm           AI      Brittle          TOC  
##   -3031.281      274.011      212.960     -276.285        2.182     -261.081  
##          VR  
##     903.304
lm.pred <- predict(lm.fit, test)
lm.error <- mean((test$Production - lm.pred)^2); lm.error
## [1] 1160302
# LASSO REGULARIZATION
x <- model.matrix(Production~., data=train)
y <- train$Production
lasso.fit <- glmnet(x,y,alpha=1)
x.test <- model.matrix(Production~., data=test)
lasso.pred <- predict(lasso.fit, s=0.01, newx=x.test)
lasso.error <- mean((test$Production - lasso.pred)^2); lasso.error
## [1] 1159825
# BOOSTING
power <- seq(from=-10,to=-0.2,by=0.1)
lambdas <- 10^power
length.lambdas <- length(lambdas)
train.error <- rep(NA,length.lambdas)
test.error <- rep(NA,length.lambdas)
for (i in 1:length.lambdas) {
  boost.Prod <- gbm(Production~.,data=train,distribution="gaussian",n.trees=1000,shrinkage=lambdas[i])
  train.pred <- predict(boost.Prod, train, n.trees=1000)
  train.error[i] <- mean((train$Production-train.pred)^2)
  test.pred <- predict(boost.Prod, test, n.trees=1000)
  test.error[i] <- mean((test$Production-test.pred)^2)
}
par(mfrow=c(1,2))
plot(lambdas,train.error,type="b",xlab="Shrinkage",ylab="Train MSE",col="darkgreen",main="Train MSE vs Shrinkage")
plot(lambdas,test.error,type="b",xlab="Shrinkage",ylab="Test MSE",col="darkblue",main="Test MSE vs Shrinkage")

par(mfrow=c(1,1))
# lambda-value for minimal test MSE
test.MSE.min <- min(test.error)
lambda.min.test.MSE <- lambdas[which.min(test.error)]
data.frame(lambda.min.test.MSE,test.MSE.min)
##   lambda.min.test.MSE test.MSE.min
## 1          0.02511886     325629.3
boost.error <- test.MSE.min
# Best variables for boosting
boost.best <- gbm(Production~., data=train, distribution="gaussian", 
                  n.trees=1000, shrinkage=lambdas[which.min(test.error)])
summary(boost.best)

##             var    rel.inf
## Por         Por 53.0448144
## Brittle Brittle 39.0289182
## TOC         TOC  4.3693123
## LogPerm LogPerm  2.5821361
## VR           VR  0.6244537
## AI           AI  0.3503652
# RANDOM FOREST
rf.Prod <- randomForest(Production~., data=train, ntree=500, mtry=length(rock.data)-1)
importance(rf.Prod)
##         IncNodePurity
## Por         557204732
## LogPerm       5758044
## AI            3848514
## Brittle     399223627
## TOC           4593537
## VR            3518796
varImpPlot(rf.Prod)

rf.predict <- predict(rf.Prod, test)
rf.error <- mean((test$Production - rf.predict)^2); rf.error
## [1] 94342.24
# Comparing MSE for all methods
scientific(data.frame(tree.error,boost.error,lm.error,lasso.error,rf.error), digits=2)
##  tree.error boost.error    lm.error lasso.error    rf.error 
##   "6.9e+05"   "3.3e+05"   "1.2e+06"   "1.2e+06"   "9.4e+04"
# PCA
PCA <- prcomp(rock.data[,-7], scale=TRUE)
PCA$center #the means substracted from variables
##      Por  LogPerm       AI  Brittle      TOC       VR 
## 14.95046  1.39888  2.98261 49.71948  1.00381  1.99117
PCA$scale  #the standardization factor
##        Por    LogPerm         AI    Brittle        TOC         VR 
##  3.0296343  0.4059657  0.5776287 15.0770064  0.5049778  0.3081940
PCA$rotation  #principal component loadings
##                 PC1       PC2          PC3         PC4         PC5        PC6
## Por     -0.55024315 0.1381835 -0.005010946 -0.22272928  0.78608255  0.1028522
## LogPerm -0.47410001 0.1617101 -0.036607962 -0.66875903 -0.53354673 -0.1258099
## AI       0.41512557 0.3930196  0.457582045 -0.36799338  0.04539624  0.5712662
## Brittle  0.22649612 0.3665224 -0.870138284 -0.09235154  0.03386977  0.2180463
## TOC     -0.49811711 0.2295692  0.035471200  0.54699223 -0.30608030  0.5522984
## VR       0.02666979 0.7831028  0.175654719  0.24495835  0.02256057 -0.5428356
# Plotting the PCA biplot
biplot(PCA, cex=0.8, scale=0, pch=21)

# Analyze for two first principal components
nComp <- 2
Xhat2 <- t(t(PCA$x[,1:nComp] %*% t(PCA$rotation[,1:nComp])) *PCA$scale + PCA$center)
plot(LogPerm~Por, Xhat2, main="Log Permeability vs Porosity (from PC1 and PC2)", pch=19, 
     xlim=c(5,25), ylim=c(0,3), xlab="Porosity (%)", ylab="Log Permeability", col=alpha("black",0.3))

theta <- seq(0,2*pi,length.out=1000)
circle <- data.frame(x=cos(theta),y=sin(theta))
p <- ggplot(circle,aes(x,y)) + geom_path()
loadings <- data.frame(PCA$rotation, .names=row.names(PCA$rotation))
p + geom_text(data=loadings, mapping=aes(x=PC1, y=PC2, label=.names,colour=.names)) + 
  coord_fixed(ratio=1) + labs(x="PC1", y="PC2")

# Variance explained
pr.var <- PCA$sdev^2
pve <- pr.var / sum(pr.var); pve
## [1] 0.470751886 0.249563778 0.142183967 0.106323785 0.022664180 0.008512404
plot(pve, xlab="Principal Component", ylab="Proportion of Variance Explained", 
     main="Principal Component vs. Variance Explained", pch=19, col="red", ylim=c(0,1))
points(cumsum(pve), xlab="Principal Component", ylim=c(0,1), 
       pch=19, col="blue"); mtext("Cumulative Proportion of Variance Explained", side = 4)
legend("right",inset=.02,title="Variance Explained by PC", c("Proportion","Cumulative"),fill=c("red","blue"),cex=0.8)

plot(PCA, type="l", main="Variance by Principal Component", ylim=c(0,5))

summary(PCA)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6
## Standard deviation     1.6806 1.2237 0.9236 0.7987 0.36876 0.22600
## Proportion of Variance 0.4708 0.2496 0.1422 0.1063 0.02266 0.00851
## Cumulative Proportion  0.4708 0.7203 0.8625 0.9688 0.99149 1.00000
#ggplot for Porosity and Brittleness against Production
ggplot(data=rock.data,aes(x=Por,y=Brittle)) + geom_point(aes(color=Production)) + 
  ggtitle("Porosity vs. Brittleness \n for Unconventional Wells")